library(ggplot2)
library(stringi)
library(gridExtra)
library(dendextend)
library(kableExtra)
library(limma)
library(psych)
library(tidyverse)
library(CONSTANd)  # install from source: https://github.com/PDiracDelta/CONSTANd/
source('other_functions.R')
source('plotting_functions.R')
data.list <- readRDS(params$input_data_p)
dat.l <- data.list$dat.l # data in long format
dat.w <- data.list$dat.w # data in wide format
# which proteins were spiked in?
spiked.proteins <- dat.l %>% distinct(Protein) %>% filter(stri_detect(Protein, fixed='ups')) %>% pull %>% as.character
tmp=dat.l %>% distinct(Protein) %>% pull %>% as.character
# protein subsampling
if (params$subsample_p>0 & params$subsample_p==floor(params$subsample_p) & params$subsample_p<=length(tmp)){
  sub.prot <- tmp[sample(1:length(tmp), size=params$subsample_p)]
  if (length(spiked.proteins)>0) sub.prot <- c(sub.prot,spiked.proteins)
  dat.l <- dat.l %>% filter(Protein %in% sub.prot)
  dat.w <- dat.w %>% filter(Protein %in% sub.prot)
}
# specify # of varying component variants and their names
variant.names <- c('log2_intensity', 'intensity', 'ratio_rowmean','ratio_condmean','ratio_condpairs','ratio_onetag')
n.comp.variants <- length(variant.names)
scale.vec <- c('log', 'raw', 'raw','raw','raw','raw')  # ratios are considered raw, because they are basically mean-normalized intensities

# get some data parameters created in the data_prep script
referenceCondition <- data.list$data.params$referenceCondition
channelsOrdered <- data.list$data.params$channelsOrdered
condition.color <- data.list$data.params$condition.color
ma.onesample.num <- data.list$data.params$ma.onesample.num
ma.onesample.denom <- data.list$data.params$ma.onesample.denom
ma.allsamples.num <- data.list$data.params$ma.allsamples.num
ma.allsamples.denom <- data.list$data.params$ma.allsamples.denom
# create data frame with sample information
sample.info <- get_sample_info(dat.l, condition.color)
# get channel names
channelNames <- remove_factors(unique(sample.info$Channel))

1 Unit scale component

dat.unit.l <- emptyList(variant.names)

1.1 log2 transformation of reporter ion intensities

dat.unit.l$log2_intensity <- dat.l %>% mutate(response=log2(intensity)) %>% select(-intensity)

1.2 intensities on the original scale

dat.unit.l$intensity <- dat.l %>% rename(response=intensity)

1.3 intensity ratios

ratio_rowmean: Calculate ratio (per PSM) with respect to average intensity within run, or in other words: each value is divided by the row mean.

# use half-wide data to compute within-run average of PSM channels corresponding to the reference Condition
refCols <- sample.info %>% distinct(Channel) %>% pull(Channel)
denom.df=dat.l %>% pivot_wider(id_cols=-one_of('Condition', 'BioReplicate'),names_from='Channel', values_from='intensity')
denom.df$denom=apply(denom.df[,refCols], 1, function(x) mean(x, na.rm=T))
denom.df=denom.df[,c('Run', 'Protein', 'Peptide', 'RT', 'Charge', 'PTM', 'denom')]
dat.unit.l$ratio_rowmean <- dat.l %>% left_join(denom.df, by=c('Run', 'Protein', 'Peptide', 'RT', 'Charge', 'PTM')) %>% mutate(response=intensity/denom) %>% select(-c(intensity, denom)) 

ratio_condmean: each sample is divided by the average of samples belonging to the reference condition (here ‘r referenceCondition’)

# use half-wide data to compute within-run average of PSM channels corresponding to the reference Condition
refCols <- sample.info %>% filter(Condition==referenceCondition) %>% distinct(Channel) %>% pull
denom.df=dat.l %>% filter(Condition==referenceCondition) %>% pivot_wider(id_cols=-one_of('Condition', 'BioReplicate'),names_from='Channel', values_from='intensity')
denom.df$denom=apply(denom.df[,refCols], 1, function(x) mean(x, na.rm=T))
denom.df=denom.df[,c('Run', 'Protein', 'Peptide', 'RT', 'Charge', 'PTM', 'denom')]
dat.unit.l$ratio_condmean <- dat.l %>% left_join(denom.df, by=c('Run', 'Protein', 'Peptide', 'RT', 'Charge', 'PTM')) %>% mutate(response=intensity/denom) %>% select(-c(intensity, denom)) 

ratio_condpairs: compute ratios using samples from a selected condition as ratio denominator (channels paired from left to right)

dat.unit.l$ratio_condpairs <- compute_ratio(dat.w, sample.info, channelsOrdered, referenceCondition)

ratio_onetag: divide by one channel reserved for the reference condition (here the first channel of 0.5)

refCols <- sample.info %>% filter(Condition %in% referenceCondition) %>% group_by(Run) %>% top_n(1, Channel) %>% pull(Sample) %>% as.character()
left_data=dat.unit.l$intensity %>% filter(paste(Run, Channel, sep=':') %in% refCols) %>% select('Run', 'Protein', 'Peptide', 'RT', 'Charge', 'PTM', 'response') %>% rename(denom=response)
dat.unit.l$ratio_onetag=left_join(dat.unit.l$intensity, left_data, by=c('Run', 'Protein', 'Peptide', 'RT', 'Charge', 'PTM'))
dat.unit.l$ratio_onetag <- dat.unit.l$ratio_onetag %>% mutate(response=response/denom) %>% select(-denom)

2 Normalization component: medianSweeping (1)

# switch to wide format
dat.unit.w <- lapply(dat.unit.l, function(x){
  pivot_wider(data = x, id_cols=-one_of(c('Condition', 'BioReplicate')), names_from=Channel, values_from=response)})
# subtract the spectrum median log2intensity from the observed log2intensities
# subtract the spectrum median log2intensity from the observed log2intensities
dat.norm.w <- dat.unit.w
dat.norm.w$log2_intensity[,channelNames] <- median_sweep(dat.norm.w$log2_intensity[,channelNames], 1, '-')
dat.norm.w$intensity[,channelNames] <- median_sweep(dat.norm.w$intensity[,channelNames], 1, '/')
# dat.norm.w$ratio_rowmean[,channelNames] <- median_sweep(dat.norm.w$ratio_rowmean[,channelNames], 1, '/')
# dat.norm.w$ratio_condmean[,channelNames] <- median_sweep(dat.norm.w$ratio_condmean[,channelNames], 1, '/')
# dat.norm.w$ratio_condpairs[,channelNames] <- median_sweep(dat.norm.w$ratio_condpairs[,channelNames], 1, '/')
# dat.norm.w$ratio_onetag[,channelNames] <- median_sweep(dat.norm.w$ratio_onetag[,channelNames], 1, '/')

3 Summarization component: Median summarization

Summarize quantification values from PSM to peptide (first step) to protein (second step).

# normalized data
dat.norm.summ.w <- lapply(dat.norm.w, function(x){
  y <- x %>% group_by(Run, Protein, Peptide) %>% summarise_at(.vars = channelNames, .funs = median, na.rm=T) %>% summarise_at(.vars = channelNames, .funs = median, na.rm=T) %>% ungroup()
  return(y)})

Let’s also summarize the non-normalized data for comparison in the next section.

# non-normalized data
# group by (run,)protein,peptide then summarize twice (once on each level)
# add select() statement because summarise_at is going bananas over character columns
dat.nonnorm.summ.w <- lapply(dat.unit.w, function(x) {
  y <- x %>% group_by(Run, Protein, Peptide) %>% summarise_at(.vars = channelNames, .funs = median, na.rm=T) %>% summarise_at(.vars = channelNames, .funs = median, na.rm=T) %>% ungroup()
  return(y)})

4 Normalization component: medianSweeping (2)

# medianSweeping: in each channel, subtract/divide median computed across all proteins within the channel
# do the above separately for each MS run
second_median_sweep <- function(dat, fun){
  dat.split <- split(dat, dat$Run) 
  dat.split.norm  <- lapply(dat.split, function(y) {
    y[,channelNames] <- median_sweep(y[,channelNames], 2, fun); return(y)})
  return(bind_rows(dat.split.norm))
}
dat.norm.summ.w$log2_intensity <- second_median_sweep(dat.norm.summ.w$log2_intensity, '-')
dat.norm.summ.w$intensity <- second_median_sweep(dat.norm.summ.w$intensity, '/')
# dat.norm.summ.w$ratio_rowmean <- second_median_sweep(dat.norm.summ.w$ratio_rowmean, '/')
# dat.norm.summ.w$ratio_condmean <- second_median_sweep(dat.norm.summ.w$ratio_condmean, '/')
# dat.norm.summ.w$ratio_condpairs <- second_median_sweep(dat.norm.summ.w$ratio_condpairs, '/')
# dat.norm.summ.w$ratio_onetag <- second_median_sweep(dat.norm.summ.w$ratio_onetag, '/')

5 QC plots

# make data completely wide (also across runs)
## normalized data
dat.norm.summ.w2 <- lapply(dat.norm.summ.w, function(x) {
  return( x %>% pivot_wider(names_from = Run, values_from = all_of(channelNames), names_glue = "{Run}:{.value}") )
})

## non-normalized data
dat.nonnorm.summ.w2 <- lapply(dat.nonnorm.summ.w, function(x) {
  return( x %>% pivot_wider(names_from = Run, values_from = all_of(channelNames), names_glue = "{Run}:{.value}") )
})

5.1 Boxplots

# use (half-)wide format
par(mfrow=c(1,2))
for (i in 1:n.comp.variants){
  boxplot_w(dat.nonnorm.summ.w[[variant.names[i]]],sample.info, paste('raw', variant.names[i], sep='_'))
  boxplot_w(dat.norm.summ.w[[variant.names[i]]], sample.info, paste('normalized', variant.names[i], sep='_'))}

5.2 MA plots

MA plots of two single samples taken from condition 1 and condition 0.125, measured in different MS runs (samples Mixture2_1:127C and Mixture1_2:129N, respectively).

for (i in 1:n.comp.variants){
  p1 <- maplot_ils(dat.nonnorm.summ.w2[[variant.names[i]]], ma.onesample.num, ma.onesample.denom, scale=scale.vec[i], paste('raw', variant.names[i], sep='_'), spiked.proteins)
  p2 <- maplot_ils(dat.norm.summ.w2[[variant.names[i]]], ma.onesample.num, ma.onesample.denom, scale=scale.vec[i], paste('normalized', variant.names[i], sep='_'), spiked.proteins)
grid.arrange(p1, p2, ncol=2)}

MA plots of all samples from condition 1 and condition 0.125 (quantification values averaged within condition).

channels.num <- sample.info %>% filter(Condition==ma.allsamples.num) %>% select(Sample) %>% pull
channels.denom <- sample.info %>% filter(Condition==ma.allsamples.denom) %>% select(Sample) %>% pull
for (i in 1:n.comp.variants){
  p1 <- maplot_ils(dat.nonnorm.summ.w2[[variant.names[i]]], channels.num, channels.denom, scale=scale.vec[i], paste('raw', variant.names[i], sep='_'), spiked.proteins)
  p2 <- maplot_ils(dat.norm.summ.w2[[variant.names[i]]], channels.num, channels.denom, scale=scale.vec[i], paste('normalized', variant.names[i], sep='_'), spiked.proteins)
grid.arrange(p1, p2, ncol=2)}

5.3 PCA plots

5.3.1 Using all proteins

par(mfrow=c(1, 2))
for (i in 1:n.comp.variants){
  if (variant.names[i]=='intensity') pca.scale=TRUE else pca.scale=FALSE
  pcaplot_ils(dat.nonnorm.summ.w2[[variant.names[i]]] %>% select(-'Protein'), info=sample.info, paste('raw', variant.names[i], sep='_'), scale=pca.scale)
  pcaplot_ils(dat.norm.summ.w2[[variant.names[i]]] %>% select(-'Protein'), info=sample.info, paste('normalized', variant.names[i], sep='_'))}

5.3.2 Using spiked proteins only (if applicable)

par(mfrow=c(1, 2))
for (i in 1:n.comp.variants){
  if (variant.names[i]=='intensity') pcaplot_ils(dat.nonnorm.summ.w2[[variant.names[i]]] %>% filter(Protein %in% spiked.proteins) %>% select(-'Protein'), info=sample.info, paste('raw', variant.names[i], sep='_'), scale=T) else pcaplot_ils(dat.nonnorm.summ.w2[[variant.names[i]]] %>% filter(Protein %in% spiked.proteins) %>% select(-'Protein'), info=sample.info, paste('raw', variant.names[i], sep='_'))
  pcaplot_ils(dat.norm.summ.w2[[variant.names[i]]] %>% filter(Protein %in% spiked.proteins) %>% select(-'Protein'), info=sample.info, paste('normalized', variant.names[i], sep='_'))}

5.4 HC (hierarchical clustering) plots

5.4.1 Using all proteins

par(mfrow=c(1,2))
for (i in 1:n.comp.variants){
  dendrogram_ils(dat.nonnorm.summ.w2[[variant.names[i]]] %>% select(-Protein), info=sample.info, paste('raw', variant.names[i], sep='_'))
  dendrogram_ils(dat.norm.summ.w2[[variant.names[i]]] %>% select(-Protein), info=sample.info, paste('normalized', variant.names[i], sep='_'))}

5.5 Run effect p-value plot

Below the histograms of p-values from the linear model explaining the response variable with Run as a covariate.

plots <- vector('list', n.comp.variants)
for (i in 1:n.comp.variants){
dat <- list(dat.nonnorm.summ.l[[variant.names[i]]], dat.norm.summ.l[[variant.names[i]]])
names(dat) <- c(paste('raw', variant.names[i], sep='_'), paste('normalized', variant.names[i], sep='_'))
plots[[i]] <- run_effect_plot(dat)}
grid.arrange(grobs = plots, nrow=n.comp.variants)

6 DEA component: Moderated t-test

# remove ref channels for DEA
# excl.sample1 <- sample.info %>% filter(Condition==ratioCondition) %>% pull(Sample) %>% as.character
# excl.sample2 <- sample.info %>% filter(stri_detect(Sample, fixed=refChannel)) %>% pull(Sample) %>% as.character
# dat.norm.summ.w2$ratio_condpairs <- dat.norm.summ.w2$ratio_condpairs %>% select(-all_of(excl.sample1))
# dat.norm.summ.w2$ratio_onetag <- dat.norm.summ.w2$ratio_onetag %>% select(-all_of(excl.sample2))

NOTE: - actually, lmFit (used in moderated_ttest) was built for log2-transformed data. However, supplying untransformed intensities can also work. This just means that the effects in the linear model are also additive on the untransformed scale, whereas for log-transformed data they are multiplicative on the untransformed scale. Also, there may be a bias which occurs from biased estimates of the population means in the t-tests, as mean(X) is not equal to exp(mean(log(X))).

#{ INVESTIGATE late log2 transform
dat.norm.summ.w2$intensity_lateLog2 <- dat.norm.summ.w2$intensity
channelNames.prefixed <- colnames(dat.norm.summ.w2$intensity %>% select(-Protein))
dat.norm.summ.w2$intensity_lateLog2[,channelNames.prefixed] <- log2(dat.norm.summ.w2$intensity[,channelNames.prefixed])
variant.names <- names(dat.norm.summ.w2)
scale.vec <- c(scale.vec, 'log')
n.comp.variants <- n.comp.variants + 1
#}
design.matrix <- get_design_matrix(referenceCondition, sample.info)
dat.dea <- emptyList(names(dat.norm.summ.w2))
for (i in 1:n.comp.variants) {
  this_scale <- scale.vec[match(names(dat.dea)[i], variant.names)]
  d <- column_to_rownames(as.data.frame(dat.norm.summ.w2[[variant.names[i]]]), 'Protein')
  dat.dea[[variant.names[i]]] <- moderated_ttest(dat=d, design.matrix, scale=this_scale)
}

7 Results comparison

7.1 Confusion matrix

cm <- conf_mat(dat.dea, 'q.mod', 0.05, spiked.proteins)
print_conf_mat(cm, referenceCondition)
0.125 vs 0.5 contrast
log2_intensity
intensity
ratio_rowmean
ratio_condmean
ratio_condpairs
ratio_onetag
intensity_lateLog2
background spiked background spiked background spiked background spiked background spiked background spiked background spiked
not_DE 4061 4 4062 6 4064 19 4064 19 4064 19 4064 19 4061 4
DE 3 15 2 13 0 0 0 0 0 0 0 0 3 15
0.125 vs 0.5 contrast
log2_intensity intensity ratio_rowmean ratio_condmean ratio_condpairs ratio_onetag intensity_lateLog2
Accuracy 0.998 0.998 0.995 0.995 0.995 0.995 0.998
Sensitivity 0.789 0.684 0.000 0.000 0.000 0.000 0.789
Specificity 0.999 1.000 1.000 1.000 1.000 1.000 0.999
PPV 0.833 0.867 NaN NaN NaN NaN 0.833
NPV 0.999 0.999 0.995 0.995 0.995 0.995 0.999
0.667 vs 0.5 contrast
log2_intensity
intensity
ratio_rowmean
ratio_condmean
ratio_condpairs
ratio_onetag
intensity_lateLog2
background spiked background spiked background spiked background spiked background spiked background spiked background spiked
not_DE 4064 18 4063 15 4064 19 4060 19 4064 19 4064 19 4064 18
DE 0 1 1 4 0 0 4 0 0 0 0 0 0 1
0.667 vs 0.5 contrast
log2_intensity intensity ratio_rowmean ratio_condmean ratio_condpairs ratio_onetag intensity_lateLog2
Accuracy 0.996 0.996 0.995 0.994 0.995 0.995 0.996
Sensitivity 0.053 0.211 0.000 0.000 0.000 0.000 0.053
Specificity 1.000 1.000 1.000 0.999 1.000 1.000 1.000
PPV 1.000 0.800 NaN 0.000 NaN NaN 1.000
NPV 0.996 0.996 0.995 0.995 0.995 0.995 0.996
1 vs 0.5 contrast
log2_intensity
intensity
ratio_rowmean
ratio_condmean
ratio_condpairs
ratio_onetag
intensity_lateLog2
background spiked background spiked background spiked background spiked background spiked background spiked background spiked
not_DE 4060 5 4058 3 4060 8 4060 10 4064 19 4064 19 4060 5
DE 4 14 6 16 4 11 4 9 0 0 0 0 4 14
1 vs 0.5 contrast
log2_intensity intensity ratio_rowmean ratio_condmean ratio_condpairs ratio_onetag intensity_lateLog2
Accuracy 0.998 0.998 0.997 0.997 0.995 0.995 0.998
Sensitivity 0.737 0.842 0.579 0.474 0.000 0.000 0.737
Specificity 0.999 0.999 0.999 0.999 1.000 1.000 0.999
PPV 0.778 0.727 0.733 0.692 NaN NaN 0.778
NPV 0.999 0.999 0.998 0.998 0.995 0.995 0.999

7.2 Scatter plots

# character vectors containing logFC and p-values columns
dea.cols <- colnames(dat.dea[[1]])
logFC.cols <- dea.cols[stri_detect_fixed(dea.cols, 'logFC')]
significance.cols <- dea.cols[stri_detect_fixed(dea.cols, 'q.mod')]
n.contrasts <- length(logFC.cols)

scatterplot_ils(dat.dea, significance.cols, 'q-values', spiked.proteins, referenceCondition)

scatterplot_ils(dat.dea, logFC.cols, 'log2FC', spiked.proteins, referenceCondition)

7.3 Volcano plots

for (i in 1:n.contrasts){
  volcanoplot_ils(dat.dea, i, spiked.proteins, referenceCondition)}

7.4 Violin plots

Let’s see whether the spiked protein fold changes make sense

# plot theoretical value (horizontal lines) and violin per variant
violinplot_ils(lapply(dat.dea, function(x) x[spiked.proteins, logFC.cols]), referenceCondition)

8 Conclusions

9 Session information

sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 18363)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United Kingdom.1252  LC_CTYPE=English_United Kingdom.1252   
## [3] LC_MONETARY=English_United Kingdom.1252 LC_NUMERIC=C                           
## [5] LC_TIME=English_United Kingdom.1252    
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] CONSTANd_0.99.0   forcats_0.5.0     stringr_1.4.0     dplyr_1.0.2       purrr_0.3.4      
##  [6] readr_1.4.0       tidyr_1.1.2       tibble_3.0.4      tidyverse_1.3.0   psych_2.0.9      
## [11] limma_3.44.3      kableExtra_1.3.1  dendextend_1.14.0 gridExtra_2.3     stringi_1.5.3    
## [16] ggplot2_3.3.2     rmarkdown_2.5     pacman_0.5.1     
## 
## loaded via a namespace (and not attached):
##   [1] ModelMetrics_1.2.2.2  R.methodsS3_1.8.1     knitr_1.30            multcomp_1.4-14      
##   [5] R.utils_2.10.1        data.table_1.13.2     rpart_4.1-15          doParallel_1.0.16    
##   [9] generics_0.1.0        BiocGenerics_0.34.0   preprocessCore_1.50.0 TH.data_1.0-10       
##  [13] webshot_0.5.2         xml2_1.3.2            lubridate_1.7.9       assertthat_0.2.1     
##  [17] viridis_0.5.1         gower_0.2.2           xfun_0.19             hms_0.5.3            
##  [21] evaluate_0.14         fansi_0.4.1           dbplyr_2.0.0          readxl_1.3.1         
##  [25] DBI_1.1.0             tmvnsim_1.0-2         ellipsis_0.3.1        backports_1.1.10     
##  [29] signal_0.7-6          libcoin_1.0-6         vctrs_0.3.4           Biobase_2.48.0       
##  [33] caret_6.0-86          withr_2.3.0           mnormt_2.0.2          crayon_1.3.4         
##  [37] recipes_0.1.15        pkgconfig_2.0.3       labeling_0.4.2        nlme_3.1-149         
##  [41] ProtGenerics_1.20.0   nnet_7.3-14           rlang_0.4.8           lifecycle_0.2.0      
##  [45] sandwich_3.0-0        affyio_1.58.0         modelr_0.1.8          cellranger_1.1.0     
##  [49] matrixStats_0.57.0    Matrix_1.2-18         boot_1.3-25           zoo_1.8-8            
##  [53] reprex_0.3.0          png_0.1-7             viridisLite_0.3.0     R.oo_1.24.0          
##  [57] pROC_1.16.2           S4Vectors_0.26.1      scales_1.1.1          magrittr_1.5         
##  [61] plyr_1.8.6            zlibbioc_1.34.0       compiler_4.0.3        pcaMethods_1.80.0    
##  [65] cli_2.1.0             affy_1.66.0           MASS_7.3-53           mgcv_1.8-33          
##  [69] tidyselect_1.1.0      vsn_3.56.0            highr_0.8             yaml_2.2.1           
##  [73] MALDIquant_1.19.3     grid_4.0.3            tools_4.0.3           rstudioapi_0.12      
##  [77] foreach_1.5.1         prodlim_2019.11.13    farver_2.0.3          mzID_1.26.0          
##  [81] digest_0.6.26         BiocManager_1.30.10   lava_1.6.8.1          Rcpp_1.0.5           
##  [85] broom_0.7.2           ncdf4_1.17            httr_1.4.2            colorspace_1.4-1     
##  [89] rvest_0.3.6           XML_3.99-0.5          fs_1.5.0              IRanges_2.22.2       
##  [93] splines_4.0.3         statmod_1.4.35        jsonlite_1.7.1        nloptr_1.2.2.2       
##  [97] timeDate_3043.102     modeltools_0.2-23     ipred_0.9-9           R6_2.5.0             
## [101] pillar_1.4.6          htmltools_0.5.0       glue_1.4.2            minqa_1.2.4          
## [105] BiocParallel_1.22.0   class_7.3-17          codetools_0.2-16      mvtnorm_1.1-1        
## [109] utf8_1.1.4            lattice_0.20-41       numDeriv_2016.8-1.1   survival_3.2-7       
## [113] admisc_0.11           munsell_0.5.0         e1071_1.7-4           iterators_1.0.13     
## [117] impute_1.62.0         haven_2.3.1           reshape2_1.4.4        gtable_0.3.0